图聚类常用函数和数据集 1. 评价函数eval


# [Click to view code]
import numpy as np
from munkres import Munkres, print_matrix
from sklearn.metrics.cluster import normalized_mutual_info_score as nmi_score
from sklearn.metrics import adjusted_rand_score as ari_score
from scipy.optimize import linear_sum_assignment as linear
from sklearn import metrics


def cluster_acc(y_true, y_pred):
    """Compute clustering accuracy and macro-F1 after optimal label matching.

    Cluster ids carry no inherent meaning, so the predicted ids are first
    matched one-to-one to the ground-truth classes with the Munkres
    (Hungarian) algorithm, maximising the number of agreeing samples.

    :param y_true: ground-truth labels (array-like of integers). They are
        shifted so that the smallest label becomes 0.
    :param y_pred: predicted cluster ids (array-like of integers). The input
        is never modified; all work happens on a private copy.
    :return: ``(acc, f1_macro)``, or ``None`` (after printing ``'error'``)
        when the number of predicted clusters still differs from the number
        of true classes after the filling step below.
    """
    y_true = np.asarray(y_true)
    y_true = y_true - np.min(y_true)
    # Copy before the filling step: it overwrites entries, and the caller's
    # array must stay intact (eva() reuses it for NMI/ARI afterwards).
    y_pred = np.array(y_pred, copy=True)

    true_labels = list(set(y_true))
    pred_labels = list(set(y_pred))

    if len(true_labels) != len(pred_labels):
        # Some true class is absent from the prediction: force the leading
        # samples to carry the missing labels so that the matching below
        # works on a square cost matrix.
        fill_at = 0
        for label in true_labels:
            if label not in pred_labels:
                y_pred[fill_at] = label
                fill_at += 1
        pred_labels = list(set(y_pred))

    if len(true_labels) != len(pred_labels):
        print('error')
        return

    # cost[i, j] = number of samples of true class i assigned to cluster j.
    cost = np.zeros((len(true_labels), len(pred_labels)), dtype=int)
    for i, true_label in enumerate(true_labels):
        in_true_class = y_true == true_label
        for j, pred_label in enumerate(pred_labels):
            cost[i, j] = np.count_nonzero(in_true_class & (y_pred == pred_label))

    # Munkres minimises, so negate the counts to maximise the overlap.
    indexes = Munkres().compute((-cost).tolist())

    # Relabel every predicted cluster with the true class it was matched to.
    new_predict = np.zeros(len(y_pred))
    for i, true_label in enumerate(true_labels):
        matched_cluster = pred_labels[indexes[i][1]]
        new_predict[y_pred == matched_cluster] = true_label

    acc = metrics.accuracy_score(y_true, new_predict)
    f1_macro = metrics.f1_score(y_true, new_predict, average='macro')
    return acc, f1_macro


def eva(y_true, y_pred, epoch='0'):
    """Print and return the four clustering metrics for one evaluation step.

    :param y_true: ground-truth labels.
    :param y_pred: predicted cluster ids.
    :param epoch: tag printed in front of the metrics; any value that
        ``str()`` accepts (e.g. an integer epoch counter) is allowed.
    :return: ``(acc, f1, nmi, ari)``.
    :raises TypeError: when ``cluster_acc`` returns ``None`` because the
        cluster count cannot be reconciled with the class count.
    """
    acc, f1 = cluster_acc(y_true, y_pred)
    nmi = nmi_score(y_true, y_pred, average_method='arithmetic')
    ari = ari_score(y_true, y_pred)
    print("第" + str(epoch), ':acc {:.4f}'.format(acc), ', nmi {:.4f}'.format(nmi),
          ', ari {:.4f}'.format(ari), ', f1 {:.4f}'.format(f1))
    return acc, f1, nmi, ari


# 2. Loading a dataset (txt)


# [Click to view code]
# Loader helpers for datasets stored as plain-text (.txt) files.
import numpy as np
import torch
from torch.utils.data import Dataset
import scipy.sparse as sp

# Interface overview: Dataset and DataLoader are the two PyTorch interfaces
# for preparing and loading data. Wrap the raw data in a
# torch.utils.data.Dataset first, then pass that object to
# torch.utils.data.DataLoader to obtain a loader that yields one batch at a
# time for training.
# References:
#   https://blog.csdn.net/weixin_44211968/article/details/123744513
#   https://www.jb51.net/article/252552.htm


class LoadDataset(Dataset):
    """Minimal map-style dataset over an in-memory feature matrix.

    A custom dataset subclasses ``Dataset`` and implements ``__init__``
    (store the data), ``__getitem__`` (index access) and ``__len__``
    (dataset size, used when iterating).
    """

    def __init__(self, data):
        # data: feature matrix with one row per node.
        self.x = data

    def __len__(self):
        # Number of rows, i.e. number of nodes.
        return self.x.shape[0]

    def __getitem__(self, idx):
        # Return the feature row as a float tensor together with its index.
        return torch.from_numpy(np.array(self.x[idx])).float(), \
            torch.from_numpy(np.array(idx))


def load_graph(k=False, graph_k_save_path="", graph_save_path="", data_path=""):
    """Build the normalised adjacency matrix from an edge list and a feature file.

    :param k: ``True`` for non-graph data (edge list read from
        ``graph_k_save_path``), ``False`` for genuine graph data (edge list
        read from ``graph_save_path``).
    :param graph_k_save_path: path of the graph.txt file for non-graph data.
    :param graph_save_path: path of the graph.txt file for graph data.
    :param data_path: path of the feature (attribute) matrix; it is only read
        to obtain the number of nodes.
    :return: row-normalised adjacency matrix (self-loops included) as a torch
        sparse COO tensor.
    :raises ValueError: when the edge list references a node id outside
        ``[0, n)``.
    """
    if k:
        path = graph_k_save_path
        print("加载非图数据结构的graph.txt路径:", path)
    else:
        path = graph_save_path
        print("加载图数据结构的graph.txt路径:", path)

    data = np.loadtxt(data_path, dtype=float)  # feature matrix, e.g. cite.txt
    n = data.shape[0]  # number of nodes

    # genfromtxt returns a 1-D array for a single-edge file; force the
    # (num_edges, 2) layout so that column indexing below always works.
    edges = np.genfromtxt(path, dtype=np.int32).reshape(-1, 2)
    if edges.size and (edges.min() < 0 or edges.max() >= n):
        raise ValueError(
            "edge list {!r} references node ids outside [0, {})".format(path, n))

    adj = sp.coo_matrix((np.ones(edges.shape[0]), (edges[:, 0], edges[:, 1])),
                        shape=(n, n), dtype=np.float32)
    # Symmetrise: wherever the transpose holds the larger entry, take it.
    adj = adj + adj.T.multiply(adj.T > adj) - adj.multiply(adj.T > adj)
    # Row normalisation; this is the step to swap for a GCN-style
    # normalisation if desired.
    adj = normalize(adj)
    adj = sparse_mx_to_torch_sparse_tensor(adj)
    return adj


# Two normalisations are commonly used with GCN-style models: plain row
# normalisation (implemented below) and the symmetric GCN normalisation.
def normalize(adj):
    """Row-normalise an adjacency matrix after adding self-loops.

    :param adj: scipy sparse adjacency matrix A.
    :return: ``D^-1 (A + I)`` as a scipy sparse matrix, where D is the
        diagonal matrix of row sums of ``A + I``.
    """
    adj = adj + sp.eye(adj.shape[0])  # A + I
    rowsum = np.array(adj.sum(1))  # row sums, i.e. the diagonal of D
    with np.errstate(divide='ignore'):
        r_inv = np.power(rowsum, -1.0).flatten()  # D^-1
    r_inv[np.isinf(r_inv)] = 0.  # rows that sum to zero stay all-zero
    r_mat_inv = sp.diags(r_inv)
    # Left-multiplication only: plain row normalisation. The symmetric GCN
    # variant would be D^-1/2 (A + I) D^-1/2 instead.
    return r_mat_inv.dot(adj)


def sparse_mx_to_torch_sparse_tensor(sparse_mx):
    """Convert a scipy sparse matrix into a torch sparse COO tensor.

    :param sparse_mx: scipy sparse matrix.
    :return: float32 torch sparse tensor with the same shape and entries.
    """
    sparse_mx = sparse_mx.tocoo().astype(np.float32)
    indices = torch.from_numpy(
        np.vstack((sparse_mx.row, sparse_mx.col)).astype(np.int64))
    values = torch.from_numpy(sparse_mx.data)
    shape = torch.Size(sparse_mx.shape)
    # torch.sparse.FloatTensor is a deprecated constructor; sparse_coo_tensor
    # is the supported way to build the same tensor.
    return torch.sparse_coo_tensor(indices, values, shape)


# Converting a scipy.sparse matrix to NumPy:
# https://www.codenong.com/26576524/
if __name__ == '__main__':
    # Feature matrix X: 3327 x 3703 (3327 nodes, 3703-dimensional features).
    x = np.loadtxt('data/cite.txt', dtype=float)
    # Label vector y: one label per node above.
    y = np.loadtxt('data/cite_label.txt', dtype=int)
    # Wrap the features in the custom dataset; dataset.x is the feature matrix.
    dataset = LoadDataset(x)
    adj = load_graph(k=False, graph_k_save_path="",
                     graph_save_path="./graph/cite_graph.txt",
                     data_path="./data/cite.txt")


# 3. Visualising training metrics


# [Click to view code]
import matplotlib.pyplot as plt
import numpy as np
import pyecharts.options as opts
from pyecharts.charts import Line
from pyecharts.globals import ThemeType


def _as_plain_list(values):
    """Return *values* as a plain list, unwrapping NumPy arrays via ``tolist()``."""
    if hasattr(values, "tolist"):
        return values.tolist()
    return list(values)


def EpochVision(x, y, modelname, arg, epoch='1'):
    """Plot one metric against the epochs and save it under ``./view/``.

    :param x: x-axis values (epochs).
    :param y: y-axis values (the metric).
    :param modelname: model name shown in the title.
    :param arg: metric name; used in the title, the y-axis label and the
        file name.
    :param epoch: suffix appended to the file name; converted with ``str()``.
    :return: None. The figure is written to ``./view/<arg><epoch>.jpg``.
    """
    import os

    # savefig does not create missing directories.
    os.makedirs("./view", exist_ok=True)

    plt.plot(np.array(x), np.array(y))
    plt.title(modelname + ": " + arg)
    plt.xlabel('Epochs')
    plt.ylabel(arg)
    plt.savefig("./view/" + arg + str(epoch) + ".jpg")
    plt.close()


def EpochEchart(x, acc, f1, nmi, ari, modelname="模型名字", iters='1'):
    """Render ACC / F1 / NMI / ARI curves into one interactive HTML chart.

    :param x: x-axis values (one entry per evaluation point).
    :param acc: accuracy values (higher is better).
    :param f1: F1-score values (higher is better).
    :param nmi: normalized mutual information values (higher is better).
    :param ari: adjusted Rand index values (higher is better).
    :param modelname: model name; prefixes every series name and the file name.
    :param iters: run counter appended to the file name; converted with
        ``str()``.
    :return: whatever ``Line.render`` returns for
        ``./<modelname><iters>.html`` (presumably the output path — confirm
        against the installed pyecharts version).
    """
    chart = Line(
        init_opts=opts.InitOpts(width="1600px", height="700px",
                                page_title="可视化epocch训练指标",
                                theme=ThemeType.DARK)
    )
    chart.set_global_opts(
        tooltip_opts=opts.TooltipOpts(is_show=False),
        xaxis_opts=opts.AxisOpts(type_="category"),
        yaxis_opts=opts.AxisOpts(
            type_="value",
            axistick_opts=opts.AxisTickOpts(is_show=True),
            splitline_opts=opts.SplitLineOpts(is_show=True),
        ),
    )
    # NumPy containers are unwrapped to plain lists before being handed over.
    chart.add_xaxis(xaxis_data=_as_plain_list(x))

    # All four series share the same styling; only name and data differ.
    for metric_name, values in (("ACC", acc), ("F1", f1), ("NMI", nmi), ("ARI", ari)):
        chart.add_yaxis(
            series_name=modelname + " : " + metric_name,
            y_axis=_as_plain_list(values),
            symbol="emptyCircle",
            is_symbol_show=True,
            is_smooth=True,
            label_opts=opts.LabelOpts(is_show=True),
        )

    return chart.render("./" + modelname + str(iters) + ".html")


if __name__ == '__main__':
    x, y = np.arange(0.4, 0.7, .03), np.arange(0, 1, .05)


# 4. Visualising the clustering result


# [Click to view code]
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE


def _as_numpy(data):
    """Return *data* as a NumPy array; torch tensors are detached and moved to host."""
    if hasattr(data, "detach"):
        return data.detach().cpu().numpy()
    return np.asarray(data)


def t_sne(embeds, labels, sample_num=2000, show_fig=True, device=True):
    """Visualise embeddings with t-SNE.

    :param embeds: embedding matrix, one row per sample (NumPy array or
        torch tensor).
    :param labels: label of every sample; used as the plotted text and to
        pick the colour.
    :param sample_num: number of samples drawn (with replacement) for the plot.
    :param show_fig: whether to call ``plt.show()``.
    :param device: kept for backward compatibility (True: CPU, False: GPU).
        Tensors are now moved to host memory regardless of this flag.
    :return fig: the matplotlib figure.
    """
    embeds = _as_numpy(embeds)
    labels = _as_numpy(labels)

    # sampling
    sample_index = np.random.randint(0, embeds.shape[0], sample_num)
    sample_embeds = embeds[sample_index]
    sample_labels = labels[sample_index]

    # t-SNE
    ts = TSNE(n_components=2, init='pca', random_state=0)
    ts_embeds = ts.fit_transform(sample_embeds)

    # Remove outliers: keep points within 3 standard deviations of the mean
    # on every axis. np.delete returns a new array, so the filtering has to
    # be applied by assignment, and the labels must be filtered alongside.
    mean, std = np.mean(ts_embeds, axis=0), np.std(ts_embeds, axis=0)
    inliers = np.all(np.abs(ts_embeds - mean) <= 3 * std, axis=1)
    if inliers.any():
        ts_embeds = ts_embeds[inliers]
        sample_labels = sample_labels[inliers]

    # Min-max normalisation to [0, 1]; a zero range is replaced by 1 to
    # avoid dividing by zero.
    x_min, x_max = np.min(ts_embeds, 0), np.max(ts_embeds, 0)
    span = x_max - x_min
    span[span == 0] = 1
    norm_ts_embeds = (ts_embeds - x_min) / span

    # plot
    fig = plt.figure()
    for point, label in zip(norm_ts_embeds, sample_labels):
        plt.text(point[0], point[1], str(label),
                 color=plt.cm.Set1(label % 7),
                 fontdict={'weight': 'bold', 'size': 7})
    plt.xticks([])
    plt.yticks([])
    plt.title('t-SNE', fontsize=14)
    plt.axis('off')
    if show_fig:
        plt.show()
    return fig


def similarity_plot(embedding, label, sample_num=1000, show_fig=True, device=True):
    """Show the cosine-similarity heat map of an embedding (or of raw features).

    :param embedding: embedding matrix, one row per sample (NumPy array or
        torch tensor).
    :param label: ground-truth labels; samples are ordered by label so that
        classes form blocks in the heat map.
    :param sample_num: the first ``sample_num`` samples are used.
    :param show_fig: whether to call ``plt.show()``.
    :param device: kept for backward compatibility (True: CPU, False: GPU).
        Tensors are now moved to host memory regardless of this flag.
    :return fig: the matplotlib figure.
    """
    embedding = _as_numpy(embedding)
    label = _as_numpy(label)

    # sampling
    label_sample = label[:sample_num]
    embedding_sample = embedding[:sample_num, :]

    # sort the embedding based on label
    embedding_sample = embedding_sample[np.argsort(label_sample)]

    # Cosine similarity; zero-norm rows are left as all-zero vectors instead
    # of producing NaN.
    norms = np.sqrt(np.sum(embedding_sample ** 2, axis=1)).reshape(-1, 1)
    norms[norms == 0] = 1
    norm_embedding_sample = embedding_sample / norms
    cosine_sim = np.matmul(norm_embedding_sample, norm_embedding_sample.transpose())
    # Everything below 1e-5 -- negative similarities included -- is shown as 0.
    cosine_sim[cosine_sim < 1e-5] = 0

    # figure
    fig = plt.figure()
    sns.heatmap(data=cosine_sim, cmap="RdBu_r", vmin=-1, vmax=1)
    plt.axis("off")

    # plot
    if show_fig:
        plt.show()
    return fig


# 5. Datasets

txt数据集:https://github.com/bdy9527/SDCN npy类型数据集:https://github.com/yueliu1999/Awesome-Deep-Graph-Clustering#benchmark-datasets npy数据处理:https://blog.csdn.net/qq_51392112/article/details/129429108


google 团队网盘申请:https://www.iculture.cc/knowledge/pig=7974

Colab 使用技巧: http://element-ui.cn/article/show-119360.html?action=onClick https://blog.csdn.net/qq_43684592/article/details/116302893 https://blog.csdn.net/Xuxianmincs/article/details/89601122 Colab GPU 训练报错: https://www.cnblogs.com/seansheep/p/16020753.html https://www.cnblogs.com/booturbo/p/16341650.html https://blog.csdn.net/hshudoudou/article/details/127383111







